lib_list <- c("readxl", "dplyr", "ggplot2",
              "car", "multcomp", "caret",
              "vegan", "plotly", "DESeq2",
              "EnhancedVolcano")

load_libs <- function(lib_list) {
  installed_libs <- lib_list %in% rownames(installed.packages())
  if (any(installed_libs == F)) {
  install.packages(lib_list[!installed_libs])
    }
  invisible(lapply(lib_list, library, character.only = T))
}

load_libs(lib_list)
## Warning: пакет 'S4Vectors' был собран под R версии 4.1.3

1. Dataset description (5 points)

“Expression levels of 77 proteins measured in the cerebral cortex of 8 classes of control and Down syndrome mice exposed to context fear conditioning, a task used to assess associative learning.”

data = read_xls("../data/raw_data.xls", na = 'NA')
str(data)
## tibble [1,080 × 82] (S3: tbl_df/tbl/data.frame)
##  $ MouseID        : chr [1:1080] "309_1" "309_2" "309_3" "309_4" ...
##  $ DYRK1A_N       : num [1:1080] 0.504 0.515 0.509 0.442 0.435 ...
##  $ ITSN1_N        : num [1:1080] 0.747 0.689 0.73 0.617 0.617 ...
##  $ BDNF_N         : num [1:1080] 0.43 0.412 0.418 0.359 0.359 ...
##  $ NR1_N          : num [1:1080] 2.82 2.79 2.69 2.47 2.37 ...
##  $ NR2A_N         : num [1:1080] 5.99 5.69 5.62 4.98 4.72 ...
##  $ pAKT_N         : num [1:1080] 0.219 0.212 0.209 0.223 0.213 ...
##  $ pBRAF_N        : num [1:1080] 0.178 0.173 0.176 0.176 0.174 ...
##  $ pCAMKII_N      : num [1:1080] 2.37 2.29 2.28 2.15 2.13 ...
##  $ pCREB_N        : num [1:1080] 0.232 0.227 0.23 0.207 0.192 ...
##  $ pELK_N         : num [1:1080] 1.75 1.6 1.56 1.6 1.5 ...
##  $ pERK_N         : num [1:1080] 0.688 0.695 0.677 0.583 0.551 ...
##  $ pJNK_N         : num [1:1080] 0.306 0.299 0.291 0.297 0.287 ...
##  $ PKCA_N         : num [1:1080] 0.403 0.386 0.381 0.377 0.364 ...
##  $ pMEK_N         : num [1:1080] 0.297 0.281 0.282 0.314 0.278 ...
##  $ pNR1_N         : num [1:1080] 1.022 0.957 1.004 0.875 0.865 ...
##  $ pNR2A_N        : num [1:1080] 0.606 0.588 0.602 0.52 0.508 ...
##  $ pNR2B_N        : num [1:1080] 1.88 1.73 1.73 1.57 1.48 ...
##  $ pPKCAB_N       : num [1:1080] 2.31 2.04 2.02 2.13 2.01 ...
##  $ pRSK_N         : num [1:1080] 0.442 0.445 0.468 0.478 0.483 ...
##  $ AKT_N          : num [1:1080] 0.859 0.835 0.814 0.728 0.688 ...
##  $ BRAF_N         : num [1:1080] 0.416 0.4 0.4 0.386 0.368 ...
##  $ CAMKII_N       : num [1:1080] 0.37 0.356 0.368 0.363 0.355 ...
##  $ CREB_N         : num [1:1080] 0.179 0.174 0.174 0.179 0.175 ...
##  $ ELK_N          : num [1:1080] 1.87 1.76 1.77 1.29 1.32 ...
##  $ ERK_N          : num [1:1080] 3.69 3.49 3.57 2.97 2.9 ...
##  $ GSK3B_N        : num [1:1080] 1.54 1.51 1.5 1.42 1.36 ...
##  $ JNK_N          : num [1:1080] 0.265 0.256 0.26 0.26 0.251 ...
##  $ MEK_N          : num [1:1080] 0.32 0.304 0.312 0.279 0.274 ...
##  $ TRKA_N         : num [1:1080] 0.814 0.781 0.785 0.734 0.703 ...
##  $ RSK_N          : num [1:1080] 0.166 0.157 0.161 0.162 0.155 ...
##  $ APP_N          : num [1:1080] 0.454 0.431 0.423 0.411 0.399 ...
##  $ Bcatenin_N     : num [1:1080] 3.04 2.92 2.94 2.5 2.46 ...
##  $ SOD1_N         : num [1:1080] 0.37 0.342 0.344 0.345 0.329 ...
##  $ MTOR_N         : num [1:1080] 0.459 0.424 0.425 0.429 0.409 ...
##  $ P38_N          : num [1:1080] 0.335 0.325 0.325 0.33 0.313 ...
##  $ pMTOR_N        : num [1:1080] 0.825 0.762 0.757 0.747 0.692 ...
##  $ DSCR1_N        : num [1:1080] 0.577 0.545 0.544 0.547 0.537 ...
##  $ AMPKA_N        : num [1:1080] 0.448 0.421 0.405 0.387 0.361 ...
##  $ NR2B_N         : num [1:1080] 0.586 0.545 0.553 0.548 0.513 ...
##  $ pNUMB_N        : num [1:1080] 0.395 0.368 0.364 0.367 0.352 ...
##  $ RAPTOR_N       : num [1:1080] 0.34 0.322 0.313 0.328 0.312 ...
##  $ TIAM1_N        : num [1:1080] 0.483 0.455 0.447 0.443 0.419 ...
##  $ pP70S6_N       : num [1:1080] 0.294 0.276 0.257 0.399 0.393 ...
##  $ NUMB_N         : num [1:1080] 0.182 0.182 0.184 0.162 0.16 ...
##  $ P70S6_N        : num [1:1080] 0.843 0.848 0.856 0.76 0.768 ...
##  $ pGSK3B_N       : num [1:1080] 0.193 0.195 0.201 0.184 0.186 ...
##  $ pPKCG_N        : num [1:1080] 1.44 1.44 1.52 1.61 1.65 ...
##  $ CDK5_N         : num [1:1080] 0.295 0.294 0.302 0.296 0.297 ...
##  $ S6_N           : num [1:1080] 0.355 0.355 0.386 0.291 0.309 ...
##  $ ADARB1_N       : num [1:1080] 1.34 1.31 1.28 1.2 1.21 ...
##  $ AcetylH3K9_N   : num [1:1080] 0.17 0.171 0.185 0.16 0.165 ...
##  $ RRP1_N         : num [1:1080] 0.159 0.158 0.149 0.166 0.161 ...
##  $ BAX_N          : num [1:1080] 0.189 0.185 0.191 0.185 0.188 ...
##  $ ARC_N          : num [1:1080] 0.106 0.107 0.108 0.103 0.105 ...
##  $ ERBB4_N        : num [1:1080] 0.145 0.15 0.145 0.141 0.142 ...
##  $ nNOS_N         : num [1:1080] 0.177 0.178 0.176 0.164 0.168 ...
##  $ Tau_N          : num [1:1080] 0.125 0.134 0.133 0.123 0.137 ...
##  $ GFAP_N         : num [1:1080] 0.115 0.118 0.118 0.117 0.116 ...
##  $ GluR3_N        : num [1:1080] 0.228 0.238 0.245 0.235 0.256 ...
##  $ GluR4_N        : num [1:1080] 0.143 0.142 0.142 0.145 0.141 ...
##  $ IL1B_N         : num [1:1080] 0.431 0.457 0.51 0.431 0.481 ...
##  $ P3525_N        : num [1:1080] 0.248 0.258 0.255 0.251 0.252 ...
##  $ pCASP9_N       : num [1:1080] 1.6 1.67 1.66 1.48 1.53 ...
##  $ PSD95_N        : num [1:1080] 2.01 2 2.02 1.96 2.01 ...
##  $ SNCA_N         : num [1:1080] 0.108 0.11 0.108 0.12 0.12 ...
##  $ Ubiquitin_N    : num [1:1080] 1.045 1.01 0.997 0.99 0.998 ...
##  $ pGSK3B_Tyr216_N: num [1:1080] 0.832 0.849 0.847 0.833 0.879 ...
##  $ SHH_N          : num [1:1080] 0.189 0.2 0.194 0.192 0.206 ...
##  $ BAD_N          : num [1:1080] 0.123 0.117 0.119 0.133 0.13 ...
##  $ BCL2_N         : num [1:1080] NA NA NA NA NA NA NA NA NA NA ...
##  $ pS6_N          : num [1:1080] 0.106 0.107 0.108 0.103 0.105 ...
##  $ pCFOS_N        : num [1:1080] 0.108 0.104 0.106 0.111 0.111 ...
##  $ SYP_N          : num [1:1080] 0.427 0.442 0.436 0.392 0.434 ...
##  $ H3AcK18_N      : num [1:1080] 0.115 0.112 0.112 0.13 0.118 ...
##  $ EGR1_N         : num [1:1080] 0.132 0.135 0.133 0.147 0.14 ...
##  $ H3MeK4_N       : num [1:1080] 0.128 0.131 0.127 0.147 0.148 ...
##  $ CaNA_N         : num [1:1080] 1.68 1.74 1.93 1.7 1.84 ...
##  $ Genotype       : chr [1:1080] "Control" "Control" "Control" "Control" ...
##  $ Treatment      : chr [1:1080] "Memantine" "Memantine" "Memantine" "Memantine" ...
##  $ Behavior       : chr [1:1080] "C/S" "C/S" "C/S" "C/S" ...
##  $ class          : chr [1:1080] "c-CS-m" "c-CS-m" "c-CS-m" "c-CS-m" ...
data$Genotype <- as.factor(data$Genotype)
data$Treatment <- as.factor(data$Treatment)
data$Behavior <- as.factor(data$Behavior)
data$class <- as.factor(data$class)

So, here are 72 mice (38 control and 34 trisomic (Down syndrome)). In the experiments, 15 measurements were registered of each protein per sample/mouse.The dataset contains a total of 1080 measurements per protein. Each measurement can be considered as an independent sample/mouse.

We can find the following groups:

## 
## Control  Ts65Dn 
##     570     510
## 
## Memantine    Saline 
##       570       510
## 
## C/S S/C 
## 525 555

Some mice have been stimulated to learn (context-shock) and others have not (shock-context).

## 
## c-CS-m c-CS-s c-SC-m c-SC-s t-CS-m t-CS-s t-SC-m t-SC-s 
##    150    135    150    135    135    105    135    135

They are quite balanced

colSums(is.na(data))
##         MouseID        DYRK1A_N         ITSN1_N          BDNF_N           NR1_N 
##               0               3               3               3               3 
##          NR2A_N          pAKT_N         pBRAF_N       pCAMKII_N         pCREB_N 
##               3               3               3               3               3 
##          pELK_N          pERK_N          pJNK_N          PKCA_N          pMEK_N 
##               3               3               3               3               3 
##          pNR1_N         pNR2A_N         pNR2B_N        pPKCAB_N          pRSK_N 
##               3               3               3               3               3 
##           AKT_N          BRAF_N        CAMKII_N          CREB_N           ELK_N 
##               3               3               3               3              18 
##           ERK_N         GSK3B_N           JNK_N           MEK_N          TRKA_N 
##               3               3               3               7               3 
##           RSK_N           APP_N      Bcatenin_N          SOD1_N          MTOR_N 
##               3               3              18               3               3 
##           P38_N         pMTOR_N         DSCR1_N         AMPKA_N          NR2B_N 
##               3               3               3               3               3 
##         pNUMB_N        RAPTOR_N         TIAM1_N        pP70S6_N          NUMB_N 
##               3               3               3               3               0 
##         P70S6_N        pGSK3B_N         pPKCG_N          CDK5_N            S6_N 
##               0               0               0               0               0 
##        ADARB1_N    AcetylH3K9_N          RRP1_N           BAX_N           ARC_N 
##               0               0               0               0               0 
##         ERBB4_N          nNOS_N           Tau_N          GFAP_N         GluR3_N 
##               0               0               0               0               0 
##         GluR4_N          IL1B_N         P3525_N        pCASP9_N         PSD95_N 
##               0               0               0               0               0 
##          SNCA_N     Ubiquitin_N pGSK3B_Tyr216_N           SHH_N           BAD_N 
##               0               0               0               0             213 
##          BCL2_N           pS6_N         pCFOS_N           SYP_N       H3AcK18_N 
##             285               0              75               0             180 
##          EGR1_N        H3MeK4_N          CaNA_N        Genotype       Treatment 
##             210             270               0               0               0 
##        Behavior           class 
##               0               0

Missing values are found only in numeric columns with protein levels.

2. Are there any differences in the level of BDNF_N production depending on the experimental class? (10 points)

data_BDNF_without_na <- subset(data, !is.na(data$BDNF_N))
data_BDNF_without_na %>%
  group_by(class) %>%
  summarise(mean_BDNF_N = mean(BDNF_N),
            sd_BDNF_N = sd(BDNF_N))
## # A tibble: 8 × 3
##   class  mean_BDNF_N sd_BDNF_N
##   <fct>        <dbl>     <dbl>
## 1 c-CS-m       0.339    0.0469
## 2 c-CS-s       0.342    0.0549
## 3 c-SC-m       0.291    0.0386
## 4 c-SC-s       0.313    0.0436
## 5 t-CS-m       0.313    0.0512
## 6 t-CS-s       0.305    0.0433
## 7 t-SC-m       0.321    0.0332
## 8 t-SC-s       0.326    0.0575

Linear model with discrete predictor and ANOVA

mod_data <- lm(BDNF_N ~ class,
               data = data_BDNF_without_na)
summary(mod_data)
## 
## Call:
## lm(formula = BDNF_N ~ class, data = data_BDNF_without_na)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.175764 -0.028777 -0.001609  0.028701  0.159388 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.339217   0.003817  88.871  < 2e-16 ***
## classc-CS-s  0.003098   0.005546   0.559   0.5766    
## classc-SC-m -0.048272   0.005398  -8.942  < 2e-16 ***
## classc-SC-s -0.025825   0.005546  -4.657 3.62e-06 ***
## classt-CS-m -0.026485   0.005546  -4.776 2.04e-06 ***
## classt-CS-s -0.033757   0.005948  -5.675 1.78e-08 ***
## classt-SC-m -0.018154   0.005546  -3.273   0.0011 ** 
## classt-SC-s -0.013631   0.005579  -2.443   0.0147 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04675 on 1069 degrees of freedom
## Multiple R-squared:  0.1097, Adjusted R-squared:  0.1039 
## F-statistic: 18.82 on 7 and 1069 DF,  p-value: < 2.2e-16
data_anova <- Anova(mod_data)
data_anova
## Anova Table (Type II tests)
## 
## Response: BDNF_N
##            Sum Sq   Df F value    Pr(>F)    
## class     0.28784    7  18.816 < 2.2e-16 ***
## Residuals 2.33619 1069                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The level of BDNF protein significantly depends on the experimental class (F = 18.8160539, p_value = 8.8566341^{-24}, df_1 = 7, df_2 = 1069).

post_hoc <- glht(mod_data,
                  linfct = mcp(class = "Tukey"))
result_hoc <- summary(post_hoc)
## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps
result_hoc
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: lm(formula = BDNF_N ~ class, data = data_BDNF_without_na)
## 
## Linear Hypotheses:
##                        Estimate Std. Error t value Pr(>|t|)    
## c-CS-s - c-CS-m == 0  0.0030979  0.0055459   0.559  0.99930    
## c-SC-m - c-CS-m == 0 -0.0482717  0.0053980  -8.942  < 0.001 ***
## c-SC-s - c-CS-m == 0 -0.0258249  0.0055459  -4.657  < 0.001 ***
## t-CS-m - c-CS-m == 0 -0.0264852  0.0055459  -4.776  < 0.001 ***
## t-CS-s - c-CS-m == 0 -0.0337570  0.0059483  -5.675  < 0.001 ***
## t-SC-m - c-CS-m == 0 -0.0181541  0.0055459  -3.273  0.02416 *  
## t-SC-s - c-CS-m == 0 -0.0136310  0.0055790  -2.443  0.22098    
## c-SC-m - c-CS-s == 0 -0.0513696  0.0055459  -9.263  < 0.001 ***
## c-SC-s - c-CS-s == 0 -0.0289228  0.0056900  -5.083  < 0.001 ***
## t-CS-m - c-CS-s == 0 -0.0295831  0.0056900  -5.199  < 0.001 ***
## t-CS-s - c-CS-s == 0 -0.0368549  0.0060829  -6.059  < 0.001 ***
## t-SC-m - c-CS-s == 0 -0.0212520  0.0056900  -3.735  0.00498 ** 
## t-SC-s - c-CS-s == 0 -0.0167289  0.0057223  -2.923  0.06882 .  
## c-SC-s - c-SC-m == 0  0.0224468  0.0055459   4.047  0.00136 ** 
## t-CS-m - c-SC-m == 0  0.0217865  0.0055459   3.928  0.00233 ** 
## t-CS-s - c-SC-m == 0  0.0145147  0.0059483   2.440  0.22245    
## t-SC-m - c-SC-m == 0  0.0301176  0.0055459   5.431  < 0.001 ***
## t-SC-s - c-SC-m == 0  0.0346406  0.0055790   6.209  < 0.001 ***
## t-CS-m - c-SC-s == 0 -0.0006603  0.0056900  -0.116  1.00000    
## t-CS-s - c-SC-s == 0 -0.0079321  0.0060829  -1.304  0.89711    
## t-SC-m - c-SC-s == 0  0.0076708  0.0056900   1.348  0.87977    
## t-SC-s - c-SC-s == 0  0.0121939  0.0057223   2.131  0.39514    
## t-CS-s - t-CS-m == 0 -0.0072718  0.0060829  -1.195  0.93312    
## t-SC-m - t-CS-m == 0  0.0083311  0.0056900   1.464  0.82594    
## t-SC-s - t-CS-m == 0  0.0128542  0.0057223   2.246  0.32368    
## t-SC-m - t-CS-s == 0  0.0156029  0.0060829   2.565  0.16934    
## t-SC-s - t-CS-s == 0  0.0201260  0.0061130   3.292  0.02297 *  
## t-SC-s - t-SC-m == 0  0.0045231  0.0057223   0.790  0.99360    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

For pairwise multiple comparisons post-hoc paired Tukey test was used (df = 1069). We observed significant differences between several classes (here we listed groups with only one factor changing, where interpretation is more clear):

  • c-SC-s and c-CS-s (control mice, stimulated/not stimulated to learn, injected with saline)
  • c-SC-m and c-CS-m (control mice, stimulated/not stimulated to learn, injected with memantine)
  • c-SC-s and c-SC-m (control mice, not stimulated to learn, injected with saline/memantine)
  • t-SC-s and t-CS-s (trisomy mice, stimulated/not stimulated to learn, injected with saline)
  • t-CS-s and c-CS-s (trisomy/control mice, stimulated to learn, injected with saline)
  • t-CS-m and c-CS-m (trisomy/control mice, stimulated to learn, injected with memantine)
  • t-SC-m and c-SC-m (trisomy/control mice, not stimulated to learn, injected with memantine)

BDNF is a member of the neurotrophin family of growth factors, which are related to the canonical nerve growth factor. Control mice, stimulated to learn, demonstrate the highest level of BDNF protein (Brain-derived neurotrophic factor), in spite of treatment type.

Memantine is an NMDA receptor antagonist, that works by decreasing abnormal activity in the brain. According to the article, memantine was shown to rescue performance of the Ts65Dn in several learning and memory tasks.

Our results show that learning stimulation increases BDNF level in control mice (that is in agreement with previous data), but decreases it in trisomy mice (regardless of treatment type). Interestingly, memantine decreases BDNF protein level in control mice, not stimulated to learn. Furthermore, the analysis between control and trisomy groups, stimulated to learn, have revealed that the level of BDNF protein content is smaller in trisomy mice, regardless of treatment type. However, not stimulated trisomy mice exposed to memantine treatment show increased level of BDNF in comparison with contol group.

3. Try to build a linear model, that can predict the level of ERBB4_N production by other proteins data (15 points)

data_without_na <-na.omit(data)
# remove Mouse_ID column
data_without_na_for_ERBB4 <- data_without_na[, 2:82]

# split the data into training and test set
set.seed(123)
training_samples <- 
  data_without_na_for_ERBB4$ERBB4_N %>%
  createDataPartition(p = 0.8, list = FALSE)

train_data <-
  data_without_na_for_ERBB4[training_samples, ]
test_data <-
  data_without_na_for_ERBB4[-training_samples, ]
ERBB4_pred_model <- lm(ERBB4_N ~ .,
                       data = train_data)
summary(ERBB4_pred_model)
## 
## Call:
## lm(formula = ERBB4_N ~ ., data = train_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0214510 -0.0028381 -0.0002578  0.0030123  0.0272713 
## 
## Coefficients: (4 not defined because of singularities)
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.0261944  0.0099148   2.642 0.008601 ** 
## DYRK1A_N        -0.0111829  0.0080486  -1.389 0.165560    
## ITSN1_N         -0.0106220  0.0132604  -0.801 0.423639    
## BDNF_N           0.0501510  0.0208316   2.407 0.016566 *  
## NR1_N           -0.0176050  0.0076925  -2.289 0.022680 *  
## NR2A_N          -0.0004137  0.0015023  -0.275 0.783201    
## pAKT_N           0.0932389  0.0343903   2.711 0.007024 ** 
## pBRAF_N         -0.0760286  0.0339889  -2.237 0.025905 *  
## pCAMKII_N        0.0020648  0.0010152   2.034 0.042687 *  
## pCREB_N         -0.0442114  0.0336857  -1.312 0.190197    
## pELK_N           0.0005901  0.0010747   0.549 0.583261    
## pERK_N          -0.0076356  0.0095159  -0.802 0.422845    
## pJNK_N          -0.0664517  0.0285649  -2.326 0.020554 *  
## PKCA_N          -0.0121265  0.0306691  -0.395 0.692782    
## pMEK_N          -0.0113037  0.0310430  -0.364 0.715973    
## pNR1_N          -0.0181065  0.0158934  -1.139 0.255355    
## pNR2A_N         -0.0001541  0.0086754  -0.018 0.985841    
## pNR2B_N          0.0147342  0.0078469   1.878 0.061228 .  
## pPKCAB_N         0.0032716  0.0032043   1.021 0.307925    
## pRSK_N           0.0022576  0.0169152   0.133 0.893897    
## AKT_N            0.0089051  0.0099235   0.897 0.370119    
## BRAF_N           0.0171293  0.0145427   1.178 0.239627    
## CAMKII_N        -0.0175886  0.0234728  -0.749 0.454154    
## CREB_N           0.0201630  0.0359409   0.561 0.575142    
## ELK_N            0.0030979  0.0042569   0.728 0.467244    
## ERK_N           -0.0002002  0.0032365  -0.062 0.950703    
## GSK3B_N         -0.0172864  0.0092741  -1.864 0.063140 .  
## JNK_N           -0.0158671  0.0409648  -0.387 0.698736    
## MEK_N            0.0303950  0.0262379   1.158 0.247450    
## TRKA_N           0.0105654  0.0189476   0.558 0.577454    
## RSK_N            0.0508162  0.0460985   1.102 0.271048    
## APP_N           -0.0272689  0.0228060  -1.196 0.232604    
## Bcatenin_N       0.0137664  0.0061961   2.222 0.026918 *  
## SOD1_N          -0.0054085  0.0045275  -1.195 0.233034    
## MTOR_N           0.0413274  0.0191100   2.163 0.031227 *  
## P38_N           -0.0032480  0.0152839  -0.213 0.831830    
## pMTOR_N         -0.0143343  0.0086706  -1.653 0.099156 .  
## DSCR1_N          0.0151403  0.0123787   1.223 0.222091    
## AMPKA_N          0.0513835  0.0267619   1.920 0.055642 .  
## NR2B_N           0.0106815  0.0104132   1.026 0.305690    
## pNUMB_N         -0.0529454  0.0290815  -1.821 0.069497 .  
## RAPTOR_N         0.0536374  0.0275528   1.947 0.052345 .  
## TIAM1_N         -0.0342437  0.0237459  -1.442 0.150144    
## pP70S6_N         0.0053228  0.0063682   0.836 0.403796    
## NUMB_N          -0.0314378  0.0437424  -0.719 0.472789    
## P70S6_N         -0.0043131  0.0065700  -0.656 0.511932    
## pGSK3B_N         0.1209513  0.0475316   2.545 0.011354 *  
## pPKCG_N         -0.0061634  0.0022786  -2.705 0.007155 ** 
## CDK5_N          -0.0016036  0.0102196  -0.157 0.875400    
## S6_N             0.0142276  0.0089512   1.589 0.112831    
## ADARB1_N        -0.0014272  0.0023056  -0.619 0.536279    
## AcetylH3K9_N    -0.0114964  0.0125481  -0.916 0.360178    
## RRP1_N          -0.0508979  0.0270387  -1.882 0.060585 .  
## BAX_N           -0.0458340  0.0444971  -1.030 0.303678    
## ARC_N            0.1678261  0.0778117   2.157 0.031679 *  
## nNOS_N           0.0200351  0.0345502   0.580 0.562355    
## Tau_N            0.0798833  0.0230796   3.461 0.000602 ***
## GFAP_N          -0.0422735  0.0637924  -0.663 0.507963    
## GluR3_N          0.0108325  0.0294415   0.368 0.713138    
## GluR4_N         -0.0773386  0.0432524  -1.788 0.074603 .  
## IL1B_N           0.0419986  0.0133267   3.151 0.001760 ** 
## P3525_N          0.0133468  0.0305957   0.436 0.662929    
## pCASP9_N         0.0060419  0.0038587   1.566 0.118275    
## PSD95_N          0.0190499  0.0042423   4.490 9.58e-06 ***
## SNCA_N          -0.0090130  0.0419516  -0.215 0.830012    
## Ubiquitin_N      0.0016481  0.0063138   0.261 0.794217    
## pGSK3B_Tyr216_N  0.0425853  0.0130005   3.276 0.001156 ** 
## SHH_N            0.0254733  0.0236638   1.076 0.282439    
## BAD_N           -0.1047916  0.0412644  -2.540 0.011519 *  
## BCL2_N           0.0291155  0.0335944   0.867 0.386694    
## pS6_N                   NA         NA      NA       NA    
## pCFOS_N         -0.0271498  0.0359647  -0.755 0.450800    
## SYP_N            0.0224881  0.0122682   1.833 0.067620 .  
## H3AcK18_N        0.0284484  0.0309120   0.920 0.358030    
## EGR1_N           0.0200172  0.0316568   0.632 0.527580    
## H3MeK4_N        -0.0068933  0.0262898  -0.262 0.793313    
## CaNA_N          -0.0031680  0.0044082  -0.719 0.472812    
## GenotypeTs65Dn   0.0061792  0.0055797   1.107 0.268844    
## TreatmentSaline  0.0030326  0.0039685   0.764 0.445256    
## BehaviorS/C     -0.0173670  0.0044434  -3.908 0.000111 ***
## classc-CS-s     -0.0005347  0.0049053  -0.109 0.913254    
## classc-SC-m      0.0030384  0.0048681   0.624 0.532928    
## classc-SC-s     -0.0021493  0.0082647  -0.260 0.794969    
## classt-CS-m     -0.0067418  0.0050586  -1.333 0.183453    
## classt-CS-s             NA         NA      NA       NA    
## classt-SC-m             NA         NA      NA       NA    
## classt-SC-s             NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.005755 on 361 degrees of freedom
## Multiple R-squared:  0.8636, Adjusted R-squared:  0.8326 
## F-statistic: 27.88 on 82 and 361 DF,  p-value: < 2.2e-16

Metrics look good

predictions <- ERBB4_pred_model %>%
  predict(test_data)
## Warning in predict.lm(., test_data): prediction from a rank-deficient fit may be
## misleading
# prediction error, RMSE
RMSE(predictions, test_data$ERBB4_N)
## [1] 0.00674468
# r-square
R2(predictions, test_data$ERBB4_N)
## [1] 0.8540674

Model shows rather goof performance

ERBB4 is a receptor tyrosine kinase, a member of the epidermal growth factor receptor family. It plays an essential role as cell surface receptor for neuregulins and EGF family members and regulates development of the central nervous system.

Let’s find out what are the best predictors for this target:

importance <- varImp(ERBB4_pred_model, scale=FALSE)
# summarize importance
importance %>% arrange(desc(Overall)) %>% top_n(10)
## Selecting by Overall
##                  Overall
## PSD95_N         4.490460
## BehaviorS/C     3.908485
## Tau_N           3.461210
## pGSK3B_Tyr216_N 3.275663
## IL1B_N          3.151463
## pAKT_N          2.711202
## pPKCG_N         2.704970
## pGSK3B_N        2.544651
## BAD_N           2.539517
## BDNF_N          2.407448

So, the most important interactors/effectors, that are connected with ERBB4 content are PSD95(postsynaptic density protein 95, membrane-associated guanylate kinase), behavior type (stimulated/notstimulated to learn), Tau protein (maintains the stability of microtubules in axons), pGSK3B_Tyr216 (glycogen synthase kinase-3 in phossphorylated form), IL1B (interleukin 1 beta), kinase AKT (there is a data, showing that ERBB4 activation promotes AKT signaling!) and other, including BDNF, described earlier.

4. PCA (15 points)

data_num <- 
  data_without_na %>% select_if(is.numeric)
data_pca <- rda(data_num, scale = TRUE)
head(summary(data_pca))
## 
## Call:
## rda(X = data_num, scale = TRUE) 
## 
## Partitioning of correlations:
##               Inertia Proportion
## Total              77          1
## Unconstrained      77          1
## 
## Eigenvalues, and their contribution to the correlations 
## 
## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Eigenvalue            23.0488 13.3587 7.58646 7.16154 3.28242 3.04234 2.36733
## Proportion Explained   0.2993  0.1735 0.09853 0.09301 0.04263 0.03951 0.03074
## Cumulative Proportion  0.2993  0.4728 0.57135 0.66436 0.70699 0.74650 0.77724
##                           PC8     PC9    PC10    PC11    PC12    PC13     PC14
## Eigenvalue            2.25528 1.83075 1.27872 1.11166 0.98313 0.86294 0.700922
## Proportion Explained  0.02929 0.02378 0.01661 0.01444 0.01277 0.01121 0.009103
## Cumulative Proportion 0.80653 0.83031 0.84691 0.86135 0.87412 0.88533 0.894428
##                           PC15     PC16     PC17     PC18     PC19     PC20
## Eigenvalue            0.611169 0.569217 0.520420 0.484876 0.418534 0.399046
## Proportion Explained  0.007937 0.007392 0.006759 0.006297 0.005436 0.005182
## Cumulative Proportion 0.902366 0.909758 0.916517 0.922814 0.928249 0.933432
##                           PC21     PC22     PC23     PC24     PC25     PC26
## Eigenvalue            0.387367 0.339991 0.319123 0.292254 0.259013 0.241727
## Proportion Explained  0.005031 0.004415 0.004144 0.003796 0.003364 0.003139
## Cumulative Proportion 0.938462 0.942878 0.947022 0.950818 0.954182 0.957321
##                           PC27    PC28     PC29     PC30     PC31     PC32
## Eigenvalue            0.216222 0.20709 0.182731 0.166772 0.150604 0.136755
## Proportion Explained  0.002808 0.00269 0.002373 0.002166 0.001956 0.001776
## Cumulative Proportion 0.960129 0.96282 0.965192 0.967358 0.969314 0.971090
##                           PC33     PC34     PC35     PC36     PC37    PC38
## Eigenvalue            0.133922 0.124826 0.123532 0.112486 0.105178 0.09622
## Proportion Explained  0.001739 0.001621 0.001604 0.001461 0.001366 0.00125
## Cumulative Proportion 0.972829 0.974450 0.976054 0.977515 0.978881 0.98013
##                           PC39     PC40     PC41     PC42      PC43      PC44
## Eigenvalue            0.091371 0.091173 0.082556 0.079101 0.0734032 0.0725823
## Proportion Explained  0.001187 0.001184 0.001072 0.001027 0.0009533 0.0009426
## Cumulative Proportion 0.981317 0.982501 0.983574 0.984601 0.9855541 0.9864968
##                            PC45      PC46      PC47      PC48      PC49
## Eigenvalue            0.0680414 0.0645015 0.0631263 0.0577969 0.0547784
## Proportion Explained  0.0008837 0.0008377 0.0008198 0.0007506 0.0007114
## Cumulative Proportion 0.9873804 0.9882181 0.9890379 0.9897885 0.9904999
##                            PC50      PC51      PC52      PC53      PC54
## Eigenvalue            0.0522431 0.0500933 0.0466117 0.0456046 0.0443441
## Proportion Explained  0.0006785 0.0006506 0.0006053 0.0005923 0.0005759
## Cumulative Proportion 0.9911784 0.9918290 0.9924343 0.9930266 0.9936025
##                            PC55      PC56      PC57      PC58      PC59
## Eigenvalue            0.0410688 0.0401500 0.0365504 0.0330063 0.0322246
## Proportion Explained  0.0005334 0.0005214 0.0004747 0.0004287 0.0004185
## Cumulative Proportion 0.9941359 0.9946573 0.9951320 0.9955606 0.9959791
##                            PC60      PC61      PC62      PC63      PC64
## Eigenvalue            0.0314064 0.0306337 0.0281387 0.0268911 0.0242670
## Proportion Explained  0.0004079 0.0003978 0.0003654 0.0003492 0.0003152
## Cumulative Proportion 0.9963870 0.9967848 0.9971503 0.9974995 0.9978147
##                            PC65      PC66      PC67     PC68      PC69
## Eigenvalue            0.0219850 0.0199656 0.0187510 0.018173 0.0167247
## Proportion Explained  0.0002855 0.0002593 0.0002435 0.000236 0.0002172
## Cumulative Proportion 0.9981002 0.9983595 0.9986030 0.998839 0.9990562
##                            PC70     PC71      PC72      PC73      PC74     PC75
## Eigenvalue            0.0139096 0.012476 0.0115619 0.0111135 0.0088829 0.008621
## Proportion Explained  0.0001806 0.000162 0.0001502 0.0001443 0.0001154 0.000112
## Cumulative Proportion 0.9992369 0.999399 0.9995490 0.9996934 0.9998087 0.999921
##                            PC76
## Eigenvalue            6.107e-03
## Proportion Explained  7.931e-05
## Cumulative Proportion 1.000e+00
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores:  14.35194 
## 
## 
## Species scores
## 
##              PC1     PC2      PC3     PC4      PC5      PC6
## DYRK1A_N -0.3947 -1.1286  0.15966 -0.5175  0.41353 -0.31216
## ITSN1_N  -0.7147 -1.1499  0.02458 -0.3317  0.32120 -0.44469
## BDNF_N   -1.4439 -0.3316  0.13718 -0.2828  0.07774  0.02153
## NR1_N    -1.4547 -0.2890  0.38531  0.2480 -0.13354  0.07702
## NR2A_N   -1.3164 -0.3703  0.56759  0.2723 -0.02731  0.04109
## pAKT_N   -1.0192  0.8742 -0.23253 -0.6059 -0.35846  0.03465
## ....                                                       
## 
## 
## Site scores (weighted sums of species scores)
## 
##          PC1     PC2       PC3    PC4    PC5    PC6
## sit1 -0.7976 -0.5826  0.143685 0.5628 0.8218 0.3433
## sit2 -0.7412 -0.4971  0.058363 0.6059 1.0285 0.4438
## sit3 -0.8751 -0.5060  0.110153 0.4895 1.0174 0.3225
## sit4 -0.3027 -0.5902  0.194312 0.3097 0.1477 0.5615
## sit5 -0.3677 -0.4779  0.003043 0.3519 0.5272 0.5926
## sit6 -0.3706 -0.4671 -0.014077 0.4099 0.4814 0.4927
## ....
biplot(data_pca, scaling = "sites", display = "sites")

df_scores <- data.frame(data_without_na,
                        scores(data_pca,
                               display = "sites",
                               choices = c(1, 2, 3),
                               scaling = "sites"))

p_scores <- ggplot(df_scores, aes(x = PC1, y = PC2)) + 
  geom_point(aes(color = class), alpha = 0.5) +
  coord_equal(xlim = c(-1.2, 1.2),
              ylim = c(-1.2, 1.2)) +
  ggtitle(label="Ordination plot of the first two axes of PCA") +
  theme_bw()
p_scores

biplot(data_pca, scaling = "species",
       display = "species")

pca_summary <- summary(data_pca)
pca_result <- as.data.frame(pca_summary$cont)
plot_data <- as.data.frame(t(pca_result[c("Proportion Explained"),]))
plot_data$component <- rownames(plot_data)
plot_data$component <-
  gsub("importance.", "",
       as.character(plot_data$component))

plot_data <- plot_data %>%
  slice_max(`Proportion Explained`, n = 15)

ggplot(plot_data,
       aes(reorder(component, -`Proportion Explained`),
           `Proportion Explained`, fill = "pink")) +
  geom_bar(stat = "identity") +
  labs(x = "Component") + theme_bw() +
  theme(legend.title = element_blank()) +
  theme(legend.position = "none")

prin_comp <- prcomp(data_num, rank. = 3)
components <- prin_comp[["x"]]
components <- data.frame(components)
components$PC2 <- -components$PC2
components$PC3 <- -components$PC3
components = cbind(components, data_without_na$class)

explained_variance_ratio <- summary(prin_comp)[["importance"]]['Proportion of Variance', 1:3]
explained_variance_ratio <- 100 * sum(explained_variance_ratio)

tit = paste('Explained Variance', 
            explained_variance_ratio)

fig <- plot_ly(components, x = ~PC1, y = ~PC2, z = ~PC3,
               color = ~data_without_na$class,
               colors = c('#636EFA','#EF553B',
                          '#00CC96')) %>%
  add_markers(size = 12)

fig <- fig %>%
  layout(
    title = tit,
    scene = list(bgcolor = "#e5ecf6")
)

fig

5. The search for differentially expressed genes (proteins) - creative part of the task (15 points).

We will use DESeq2 analysis. Here we need to compare groups pair-wise

data_for_deseq <- data_without_na %>%
  select(!c(MouseID, Genotype,
            Behavior, Treatment, class))

data_for_deseq <- data.frame(t(data_for_deseq))

# DESeq does not work with numeric type
# so let's multiply our values and convert them to integers
data_for_deseq <- data_for_deseq * 10000000
data_for_deseq <- data_for_deseq %>%
  mutate_if(is.numeric,as.integer)
colnames(data_for_deseq) <- data_without_na$MouseID

samples = colnames(data_for_deseq)
condition = data_without_na$class
colData = data.frame(samples = samples,
                     condition = condition)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
dds_c_SC_s = DESeq(dds_c_SC_s, test = 'Wald',
                   betaPrior = FALSE,
                   sfType = 'ratio')

dds_c_CS_s = DESeq(dds_c_CS_s, test = 'Wald',
                   betaPrior = FALSE,
                   sfType = 'ratio')

dds_t_SC_s = DESeq(dds_t_SC_s, test = 'Wald',
                   betaPrior = FALSE,
                   sfType = 'ratio')

dds_t_CS_s = DESeq(dds_t_CS_s, test = 'Wald',
                   betaPrior = FALSE,
                   sfType = 'ratio')
resultsNames(dds_t_CS_s)
## [1] "Intercept"                  "condition_c.SC.s_vs_t.CS.s"
## [3] "condition_c.CS.m_vs_t.CS.s" "condition_c.CS.s_vs_t.CS.s"
## [5] "condition_c.SC.m_vs_t.CS.s" "condition_t.CS.m_vs_t.CS.s"
## [7] "condition_t.SC.m_vs_t.CS.s" "condition_t.SC.s_vs_t.CS.s"
res_c_SC <- results(dds_c_SC_s, alpha = 0.05,
                    contrast = c("condition",
                                 "t-SC-s", "c-SC-s"))
summary(res_c_SC)
## 
## out of 77 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 26, 34%
## LFC < 0 (down)     : 27, 35%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 1188968)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_c_CS <- results(dds_c_CS_s, alpha = 0.05,
                    contrast = c("condition",
                                 "t-CS-s", "c-CS-s"))
summary(res_c_CS)
## 
## out of 77 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 18, 23%
## LFC < 0 (down)     : 25, 32%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 1188968)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_t_SC <- results(dds_t_SC_s, alpha = 0.05,
                    contrast = c("condition",
                                 "t-SC-m", "t-SC-s"))
summary(res_t_SC)
## 
## out of 77 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 24, 31%
## LFC < 0 (down)     : 21, 27%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 1188968)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_t_CS <- results(dds_t_CS_s, alpha = 0.05,
                    contrast = c("condition",
                                 "t-CS-m", "t-CS-s"))
summary(res_t_CS)
## 
## out of 77 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 28, 36%
## LFC < 0 (down)     : 21, 27%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 1188968)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Thus, the level of the following proteins is increased in trisomy mice, not stimulated to learn, comparing to control:

The level of histone AcetylH3K9 also differ in trisomy mice, stimulated to learn, comparing to control. The acetylation of this histone is associated with gene activation, however no evidence of its connection to Down syndrome emergence have been reported so far.

After injection of trisomy mice, not stimulated to learn, with memantine, pPKCG and kinase of ribosomal protein S6 were down-regulated, in contrast to the first comparison.

After injection of trisomy mice, stimulated to learn, with memantine, the level of proto-oncogene serine/threonine-protein kinase BRAF was elevated. This protein plays a role in cell growth by sending signals inside the cell promoting, among other functions, cell division.

Thank you for your time and attention, so to say! :)